In [2]:
# Decision Tree Classifier
from sklearn import datasets
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
In [3]:
# Load the iris dataset bundled with scikit-learn.
# For info on this dataset, refer to the logistic_regression script.
# The returned object exposes the attributes used in the cells below:
# `data` (feature values), `feature_names` (column names) and `target` (labels).
dataset = datasets.load_iris()
In [4]:
# Let us now build a pandas DataFrame hosting the data at hand.
# We first need the list of feature names to use as column labels.
# It is already stored in the dataset, so we simply reuse it.
lfeat = dataset.feature_names
In [5]:
# Wrap the raw feature values in a DataFrame: the values are passed as the
# `data` argument and the feature names become the column labels.
df_iris = pd.DataFrame(data=dataset.data, columns=lfeat)
In [7]:
# Announce, then display, the first 5 samples (rows) for all feature columns.
# print is called as a function (as `print(model)` is further down) so the cell
# runs on Python 3 as well as Python 2; the old statement form is a SyntaxError
# on Python 3.
print("Printing data up to the 5th sample")
df_iris.iloc[:5, :]  # last expression of the cell => rendered as a table
Out[7]:
In [8]:
# We also want to add the classification target (the class label of each sample)
# Let's create a new column :
df_iris["Species"] = dataset.target # Must have one entry per row of df_iris
In [9]:
# Let's review our complete dataframe, now that the target column is present.
# print is called as a function so the cell runs on Python 3 as well as
# Python 2. print("") emits the blank line on both; a bare `print` is a silent
# no-op on Python 3, and `print()` would show "()" on Python 2.
print("")
print("Printing data up to the 5th sample")
print("Also print the target")
df_iris.iloc[:5, :]  # first 5 samples for all features, including the target
Out[9]:
In [10]:
# We are now going to fit a Decision Tree model to the data.
# Let's use an example to understand what decision trees do.
# Picture a doctor and a sick patient:
# the doctor follows a protocol to find out what ails the patient,
# asking questions such as "how old are you?", "where does it hurt?" and so on.
# Each answer narrows down the options until the problem is identified.
# Decision trees proceed in the same way:
# they make a series of separations (splits) in the feature space,
# e.g. if feat1 > c => classify as class 1.
# The feature used for each split and its threshold value are learnt on the
# training data by optimising a split-quality criterion at each split
# (scikit-learn's default criterion is Gini impurity).
#
# As before, we create an instance of the model.
# random_state is pinned so that re-running the notebook from a fresh kernel
# builds the same tree every time (the estimator is otherwise allowed to
# break ties between equally good splits at random).
model = DecisionTreeClassifier(random_state=0)
In [11]:
# Fit the model created above to the training data X, Y.
# The DataFrame holds features and target side by side, so it is split in two:
# the target part (Y) is the "Species" column,
# the feature part (X) is every column listed in lfeat.
target = df_iris.loc[:, "Species"].values
data = df_iris.loc[:, lfeat].values
model.fit(data, target)
# Printing the estimator shows how it is configured.
print(model)
In [12]:
# Make predictions: as before, the fitted model can predict on any data.
# Here it is asked to predict the very samples it was fitted on.
predicted = model.predict(data)
expected = target
# Evaluate the classification with standard metrics:
# first the per-class report, then the confusion matrix.
for summarise in (metrics.classification_report, metrics.confusion_matrix):
    print(summarise(expected, predicted))
In [ ]:
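# This time we can see we got a perfect prediction - again.
# Keep in mind that the model was scored on the same data it was trained on,
# so this result says nothing about how well it generalises to unseen samples.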
An example of plotting the decision surface of a decision tree trained on pairs of iris features:
http://scikit-learn.org/stable/auto_examples/tree/plot_iris.html
In [19]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
# Draw, for every pair of iris features, the decision surface of a decision
# tree trained on that pair, with the training points overlaid.
# The six panels are laid out on a 2 x 3 grid.

# Parameters
n_classes = 3        # number of classes drawn in the scatter overlay
plot_colors = "bry"  # one matplotlib colour code per class
plot_step = 0.02     # step of the grid on which the surface is evaluated

# Load data
iris = load_iris()

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
                                [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target

    # Shuffle; NumPy's global generator is re-seeded on every pass so each
    # feature pair is shuffled with the same permutation.
    idx = np.arange(X.shape[0])
    np.random.seed(13)
    np.random.shuffle(idx)
    X = X[idx]
    y = y[idx]

    # Standardize
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary: predict the class of every grid point and
    # fill the plane with the resulting regions.
    plt.subplot(2, 3, pairidx + 1)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])
    plt.axis("tight")

    # Plot the training points, one colour per class.
    # A boolean mask selects the rows of the class; it gets its own name so the
    # shuffle permutation `idx` is not overwritten. No cmap is passed because
    # each class is drawn in a single literal colour.
    for i, color in zip(range(n_classes), plot_colors):
        in_class = y == i
        plt.scatter(X[in_class, 0], X[in_class, 1], c=color,
                    label=iris.target_names[i])

    plt.axis("tight")

plt.suptitle("Decision surface of a decision tree using paired features")
# The legend is attached to the last panel drawn and anchored outside it.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
In [ ]: